Authors: Mauro Venticinque, Angelo Schillaci, Daniele Tambone

GitHub project: Bank-Marketing

Date: 2025-03-28

Introduction

Here we will write some information about the project.

1 Exploratory Data Analysis

datatable(head(train, 100), options = list(scrollX = TRUE))
str(train)
## 'data.frame':    32950 obs. of  22 variables:
##  $ X             : int  35248 39854 14530 27822 40199 21227 16836 39099 38565 38152 ...
##  $ age           : int  30 39 43 27 56 41 57 46 61 35 ...
##  $ job           : chr  "blue-collar" "technician" "services" "student" ...
##  $ marital       : chr  "married" "married" "single" "single" ...
##  $ education     : chr  "professional.course" "university.degree" "high.school" "high.school" ...
##  $ default       : chr  "no" "no" "no" "no" ...
##  $ housing       : chr  "no" "yes" "no" "yes" ...
##  $ loan          : chr  "no" "no" "no" "no" ...
##  $ contact       : chr  "cellular" "cellular" "cellular" "cellular" ...
##  $ month         : chr  "may" "jun" "jul" "mar" ...
##  $ day_of_week   : chr  "fri" "mon" "tue" "thu" ...
##  $ duration      : int  1357 713 1317 80 230 697 1441 679 106 234 ...
##  $ campaign      : int  4 2 4 4 2 2 2 1 2 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  1 0 0 0 1 0 0 0 1 0 ...
##  $ poutcome      : chr  "failure" "nonexistent" "nonexistent" "nonexistent" ...
##  $ emp.var.rate  : num  -1.8 -1.7 1.4 -1.8 -1.7 1.4 1.4 -3 -3.4 -3.4 ...
##  $ cons.price.idx: num  92.9 94.1 93.9 92.8 94.2 ...
##  $ cons.conf.idx : num  -46.2 -39.8 -42.7 -50 -40.3 -36.1 -42.7 -33 -26.9 -29.8 ...
##  $ euribor3m     : num  1.25 0.72 4.96 1.65 0.87 ...
##  $ nr.employed   : num  5099 4992 5228 5099 4992 ...
##  $ subscribed    : chr  "yes" "yes" "yes" "yes" ...
attach(train)

1.1 Variable descriptions

1.1.1 Bank client data:

  1. X (Integer): id of customer
  2. age (Integer): age of the customer
  3. job (Categorical): occupation
  4. marital (Categorical): marital status
  5. education (Categorical): education level
  6. default (Binary): has credit in default?
  7. housing (Binary): has housing loan?
  8. loan (Binary): has personal loan?
  9. contact (Categorical): contact communication type
  10. month (Categorical): last contact month of year
  11. day_of_week (Integer): last contact day of the week
  12. duration (Integer): last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y=‘no’). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model

1.1.2 Other attributes:

  1. campaign (Integer): number of contacts performed during this campaign and for this client (numeric, includes last contact)
  2. pdays (Integer): number of days that passed by after the client was last contacted from a previous campaign (numeric; -1 means client was not previously contacted)
  3. previous (Integer): number of contacts performed before this campaign and for this client
  4. poutcome (Categorical): outcome of the previous marketing campaign (categorical: ‘failure’,‘nonexistent’,‘success’)

1.1.3 Social and economic context attributes

  1. emp.var.rate (Integer): employment variation rate - quarterly indicator
  2. cons.price.idx (Integer): consumer price index - monthly indicator
  3. cons.conf.idx (Integer): consumer confidence index - monthly indicator
  4. euribor3m (Integer): euribor 3 month rate - daily indicator
  5. nr.employed (Integer): number of employees - quarterly indicator

1.1.4 Output variable (desired target)

  1. subscribed (Binary): has the client subscribed a term deposit?

Source: UCI Machine Learning Repository

vis_dat(train)

skim(train)
Data summary
Name train
Number of rows 32950
Number of columns 22
_______________________
Column type frequency:
character 11
numeric 11
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
job 0 1 6 13 0 12 0
marital 0 1 6 8 0 4 0
education 0 1 7 19 0 8 0
default 0 1 2 7 0 3 0
housing 0 1 2 7 0 3 0
loan 0 1 2 7 0 3 0
contact 0 1 8 9 0 2 0
month 0 1 3 3 0 10 0
day_of_week 0 1 3 3 0 5 0
poutcome 0 1 7 11 0 3 0
subscribed 0 1 2 3 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
X 0 1 20622.42 11882.00 1.00 10346.50 20629.50 30883.75 41188.00 ▇▇▇▇▇
age 0 1 40.04 10.45 17.00 32.00 38.00 47.00 98.00 ▅▇▃▁▁
duration 0 1 258.66 260.83 0.00 102.00 180.00 318.00 4918.00 ▇▁▁▁▁
campaign 0 1 2.57 2.77 1.00 1.00 2.00 3.00 43.00 ▇▁▁▁▁
pdays 0 1 961.90 188.33 0.00 999.00 999.00 999.00 999.00 ▁▁▁▁▇
previous 0 1 0.17 0.49 0.00 0.00 0.00 0.00 7.00 ▇▁▁▁▁
emp.var.rate 0 1 0.08 1.57 -3.40 -1.80 1.10 1.40 1.40 ▁▃▁▁▇
cons.price.idx 0 1 93.57 0.58 92.20 93.08 93.75 93.99 94.77 ▁▆▃▇▂
cons.conf.idx 0 1 -40.49 4.63 -50.80 -42.70 -41.80 -36.40 -26.90 ▅▇▁▇▁
euribor3m 0 1 3.62 1.74 0.63 1.34 4.86 4.96 5.04 ▅▁▁▁▇
nr.employed 0 1 5167.01 72.31 4963.60 5099.10 5191.00 5228.10 5228.10 ▁▁▃▁▇
plot_ly(train, x = subscribed, type = 'histogram')
corrplot(cor(train[, c("X", "age", "duration", "campaign", "pdays", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed")]), method="pie")

plot_ly(train, x = job, y = age, type = 'box', color = job)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
plot_ly(train, x = education, y = age, type = 'box', color = education)
ord_edu <- train %>% count(education) %>%arrange(n)%>% pull(education)


eduResp <- ggplot(train, aes(x = factor(education, levels = ord_edu), fill = factor(subscribed))) +
  geom_bar(position = "fill") +
  coord_flip() +
  ylab("Proportion") +
  scale_fill_discrete(name = "Subscribed") +
  xlab("Education") +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

eduFreq <- ggplot(as.data.frame(table(education)/sum(table(education))*100), aes(x = reorder(education, Freq), y = Freq)) +
  geom_bar(stat = "identity", color = "gray",  fill = "steelblue", alpha=0.9) +  
  coord_flip() +
  labs(title = "Education", x = "Education Level", y = "Count") +
  theme_minimal()

eduFreq / eduResp

ordine_poutcome <- train %>% count(poutcome) %>% arrange(n) %>%
  pull(poutcome)

poutcomeFreq <- ggplot(as.data.frame(table(train$poutcome) / length(train$poutcome) * 100),
                       aes(x = reorder(Var1, Freq), y = Freq, fill = Var1)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(
    title = "Distribution of Poutcome",
    x = "Outcome previous campaign",
    y = "Percentage (%)"
  ) +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal() +
  theme(legend.position = "none")

poutcomeResp <- ggplot(train, aes(x = factor(poutcome, levels = ordine_poutcome), fill = factor(subscribed))) +
  geom_bar(position = "fill") +
  coord_flip() +
  ylab("Proportion") +
  scale_fill_discrete(name = "Poutcome") +
  xlab("Outcome previous campaign") +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
  theme_minimal()

(poutcomeFreq / poutcomeResp) +
  plot_layout(axis_titles = 'collect')

ggplot(train, aes(age)) + geom_histogram(binwidth=4,position="fill",aes(fill=factor(subscribed)))+scale_fill_discrete(name="Subscribed")+ylab("proportion")+geom_hline(yintercept=0.5)

ordine_job <- train %>% count(job) %>%arrange(n)%>% pull(job)


jobFreq <- ggplot(as.data.frame(table(train$job) / length(train$job) * 100),
                   aes(x = reorder(Var1, Freq), y = Freq, fill = Var1)) +
  geom_bar(stat = "identity", color = "gray",  fill = "steelblue", alpha=0.9) +
  coord_flip() +
  labs(
    title = "Distribution of job",
    x = "Outcome of the previous campaign",
    y = "Percentage (%)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

jobResp <- ggplot(train, aes(x = factor(job, levels = ordine_job), fill = factor(subscribed))) +
  geom_bar(position = "fill") +
  coord_flip() +
  labs(
    title = "Proportion by subscribed",
    x = "Default",
    y = "Proportion"
  ) +
  scale_fill_discrete(name = "Subscribed") +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
  theme_minimal()

jobFreq / jobResp

ggplot(train, aes(cons.price.idx)) + geom_histogram(binwidth=2,position="fill",aes(fill=factor(subscribed)))+scale_fill_discrete(name="Subscribed")+ylab("proportion")+geom_hline(yintercept=0.5)

ggplot(train, aes(cons.conf.idx)) + geom_histogram(binwidth=3,position="fill",aes(fill=factor(subscribed)))+scale_fill_discrete(name="Subscribed")+ylab("proportion")+geom_hline(yintercept=0.5)

ggplot(train, aes(euribor3m)) + geom_histogram(binwidth=3,position="fill",aes(fill=factor(subscribed)))+scale_fill_discrete(name="Subscribed")+ylab("proportion")+geom_hline(yintercept=0.5)

train$day_of_week <- factor(train$day_of_week,
                            levels = c("mon", "tue", "wed", "thu", "fri"),
                            ordered = TRUE)

dayFreq <- ggplot(as.data.frame(table(train$day_of_week)/length(train$day_of_week)*100), aes(x = Var1, y = Freq)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Distribution of Day of Week",
       x = "Last Contact Day of Week",
       y = "Percentage (%)")+
  theme_minimal()

dayResp <- ggplot(train, aes(x = day_of_week, fill = factor(subscribed))) +
  geom_bar(position = "fill") +
  coord_flip() +
  ylab("Proportion") +
  scale_fill_discrete(name = "Subscribed") +
  xlab("Last Contact Day of Week") +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
  theme_minimal() 

(dayFreq / dayResp) +
  plot_layout(axis_titles = 'collect')

ordine_month<-factor(train$month, 
                     levels = c("mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"), 
                     ordered = TRUE)

monthFreq <- ggplot(as.data.frame(table(ordine_month)/length(ordine_month)*100), aes(x = ordine_month, y = Freq)) +
  geom_bar(stat = "identity",color='gray', fill = "steelblue") +
  coord_flip() +
  labs(title = "Distribution of month",
       x = "Last contact month of year",
       y = "Percentage (%)")+
  theme_minimal()

monthResp <- ggplot(train, aes(x = ordine_month, fill = factor(subscribed))) +
  geom_bar(position = "fill") +
  coord_flip() +
  ylab("Proportion") +
  scale_fill_discrete(name = "month") +
  xlab("Last contact month of year") +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
  theme_minimal()

(monthFreq / monthResp) +
  plot_layout(axis_titles = 'collect')

ordine_previous <- train %>% count(previous) %>% arrange(n) %>%
  pull(previous) 

prevFreq <- ggplot(as.data.frame(table(train$previous)/length(train$previous)*100), aes(x = reorder(Var1,Freq), y = Freq)) +
  geom_bar(stat = "identity",color='gray', fill = "steelblue") +
  coord_flip() +
  labs(title = "Distribution of Previous",
       x = "Number of calls previous campain",
       y = "Percentage (%)")+
  theme_minimal()

prevResp <- ggplot(train, aes(x = factor(previous, levels = ordine_previous), fill = factor(subscribed))) +
  geom_bar(position = "fill") +
  coord_flip() +
  ylab("Proportion") +
  scale_fill_discrete(name = "Previous") +
  xlab("Number of calls previous campain") +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
  theme_minimal()

(prevFreq / prevResp) +
  plot_layout(axis_titles = 'collect')

train$emp_cat <- ifelse(train$emp.var.rate < 0, "Negative", "Positive or Zero")


ordine_emp <- train %>% count(emp_cat) %>% arrange(n) %>%
  pull(emp_cat)


empFreq <- ggplot(as.data.frame(table(ordine_emp)/length(ordine_emp)*100), aes(x = ordine_emp, y = Freq)) +
  geom_bar(stat = "identity",color='gray', fill = "steelblue") +
  coord_flip() +
  labs(title = "Distribution of Employment Variation (±)",
       x = "Employment Variation (±)",
       y = "Percentage (%)")+
  theme_minimal()

empResp <- ggplot(train, aes(x = factor(emp_cat, levels = ordine_emp), fill = factor(subscribed))) +
  geom_bar(position = "fill") +
  coord_flip() +
  ylab("Proportion") +
  scale_fill_discrete(name = "emp_cat") +
  xlab("Employment Variation (±)") +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
  theme_minimal()

(empFreq / empResp) +
  plot_layout(axis_titles = 'collect')

ggpairs(train[, c("age", "duration", "campaign", "pdays", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed")], columns = 1:10, 
                 lower = list(continuous = wrap("points", alpha = 0.5, color = "darkred", size=0.5)),
                 title='Scatterplot', axisLabels='none')